#coding:utf8
import jieba
import pickle 
import pandas as pd
import os

# Path to the 'config' directory two levels above the current working dir.
# os.path.join keeps this portable (the original hard-coded Windows '\\');
# the trailing '' component yields a trailing separator so callers can do
# config_dir + 'stopWord.txt'.
config_dir = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'config', '')
  
# 创建停用词list  
def stopwordslist(filepath):
    """Load a stopword list from *filepath* (UTF-8, one word per line).

    Each line is stripped of surrounding whitespace; returns a list of
    strings.  Uses ``with`` so the file handle is always closed (the
    original leaked it), and iterates the file directly instead of the
    redundant ``.readlines()``.
    """
    with open(filepath, 'r', encoding='utf8') as f:
        return [line.strip() for line in f]
 
 
# 对句子进行分词  
def seg_sentence(sentence):
    """Segment a sentence with jieba and drop stopwords.

    Loads the stopword list from ``config_dir + 'stopWord.txt'`` on each
    call (NOTE(review): consider caching this at module level if called
    in a tight loop).  Returns the kept tokens each followed by a single
    space — i.e. the result carries a trailing space when non-empty,
    matching the original output format that ``fenci`` later ``split()``s.
    """
    # Convert to a set: O(1) membership tests instead of an O(n) list
    # scan per token.
    stopwords = set(stopwordslist(config_dir + 'stopWord.txt'))
    tokens = [
        word
        for word in jieba.cut(sentence.strip())
        if word not in stopwords and word != '\t'
    ]
    # join instead of repeated '+=' (quadratic string building); the
    # per-token ' ' suffix reproduces the original trailing space.
    return ''.join(word + ' ' for word in tokens)

def fenci(openfilename):
    """Segment every row of the 'content' column of a CSV file.

    Reads *openfilename* with pandas, runs ``seg_sentence`` on each row
    (each result ``split()`` into a list of tokens), and pickles the
    list-of-token-lists to the same path with a ``.pkl`` extension.
    Returns None.
    """
    inputs = pd.read_csv(openfilename)
    words = [seg_sentence(line).split() for line in inputs['content']]
    # os.path.splitext drops only the final extension; the original
    # split('.')[0:-1] then [0] kept just the FIRST dot-separated piece,
    # so 'data.v2.csv' wrongly became 'data.pkl' instead of 'data.v2.pkl'.
    savefilename = os.path.splitext(openfilename)[0] + '.pkl'
    # 'with' guarantees the file is closed even if pickle.dump raises.
    with open(savefilename, 'wb') as out:
        pickle.dump(words, out)

